Code Author: Saf Flatters
Data Source: Ballarat Wind Observations Dataset
# install.packages(c(
# "fitdistrplus",
# "tidyverse",
# "forecast",
# "ggplot2",
# "tseries",
# "lubridate",
# "weibullness",
# "TTR",
# "plotly"
# ))
# Load libraries
library(fitdistrplus) # For Weibull distribution fitting
## Warning: package 'fitdistrplus' was built under R version 4.4.3
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.4.3
## Loading required package: survival
library(tidyverse) # Multiple uses
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(forecast) # For ARIMA modelling
## Warning: package 'forecast' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2) # For plotting
library(tseries) # For stationarity tests
## Warning: package 'tseries' was built under R version 4.4.3
library(lubridate) # For date-time manipulation
library(weibullness) # Goodness of fit testing of Weibull MLE
##
## weibullness Package is installed.
library(TTR) # Identifying trends with 7-day smoothing
## Warning: package 'TTR' was built under R version 4.4.3
library(stats4) # testing non standard MLEs
library(plotly) # interactive ggplots
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Import data
# readr infers column types; note date_time is parsed directly as a dttm
# (POSIXct) column — see the column specification echoed below.
wind <- read_csv("wind-observations_ballarat.csv")
## Rows: 60416 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): location_description, wind_direction_cardinal, point, polar
## dbl (7): latitude, longitude, average_wind_speed, gust_speed, wind_directio...
## dttm (1): date_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick visual check of the first rows
head(wind)
## # A tibble: 6 × 12
## date_time location_description latitude longitude average_wind_speed
## <dttm> <chr> <dbl> <dbl> <dbl>
## 1 2024-01-04 20:37:30 Rowing Course -37.6 144. 6.45
## 2 2024-01-04 21:22:17 Rowing Course -37.6 144. 5.39
## 3 2024-01-04 23:23:33 Rowing Course -37.6 144. 3.15
## 4 2024-01-04 23:53:11 Rowing Course -37.6 144. 2.97
## 5 2024-01-05 02:54:10 Rowing Course -37.6 144. 2.99
## 6 2024-01-05 06:23:45 Rowing Course -37.6 144. 7.11
## # ℹ 7 more variables: gust_speed <dbl>, wind_direction <dbl>,
## # wind_direction_cardinal <chr>, point <chr>, wind_speed_average <dbl>,
## # wind_speed_gust <dbl>, polar <chr>
# Structure: confirms 60,416 rows, 12 columns, and the stored readr spec
str(wind)
## spc_tbl_ [60,416 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date_time : POSIXct[1:60416], format: "2024-01-04 20:37:30" "2024-01-04 21:22:17" ...
## $ location_description : chr [1:60416] "Rowing Course" "Rowing Course" "Rowing Course" "Rowing Course" ...
## $ latitude : num [1:60416] -37.6 -37.6 -37.6 -37.6 -37.6 ...
## $ longitude : num [1:60416] 144 144 144 144 144 ...
## $ average_wind_speed : num [1:60416] 6.45 5.39 3.15 2.97 2.99 7.11 4.65 4.12 4.63 2.2 ...
## $ gust_speed : num [1:60416] 10.27 7.6 4.74 4.86 5.07 ...
## $ wind_direction : num [1:60416] 107 99.9 81.2 74.1 152.8 ...
## $ wind_direction_cardinal: chr [1:60416] "ESE" "E" "E" "ENE" ...
## $ point : chr [1:60416] "-37.554098, 143.824685" "-37.554098, 143.824685" "-37.554098, 143.824685" "-37.554098, 143.824685" ...
## $ wind_speed_average : num [1:60416] 23.2 19.4 11.3 10.7 10.8 ...
## $ wind_speed_gust : num [1:60416] 37 27.4 17.1 17.5 18.3 ...
## $ polar : chr [1:60416] "06 ESE" "05 E" "05 E" "04 ENE" ...
## - attr(*, "spec")=
## .. cols(
## .. date_time = col_datetime(format = ""),
## .. location_description = col_character(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. average_wind_speed = col_double(),
## .. gust_speed = col_double(),
## .. wind_direction = col_double(),
## .. wind_direction_cardinal = col_character(),
## .. point = col_character(),
## .. wind_speed_average = col_double(),
## .. wind_speed_gust = col_double(),
## .. polar = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Plot raw wind speed over time
# NOTE(review): the data is also re-sorted by date_time in the cleaning step
# further down — harmless, but redundant.
wind <- wind[order(wind$date_time), ] # put in date order
plot(wind$date_time,
wind$average_wind_speed,
type = "l",
xlab = "Date/Time",
ylab = "Wind Speed (m/s)",
main = "Wind Speed Over Time - Raw Data")
I examined the observational density to understand how consistently wind speed measurements were recorded over time as it appeared to be inconsistent. This helped me plan how to aggregate the data for modelling.
# Convert date_time column to proper datetime format and filter to one year of data
# NOTE(review): read_csv already parsed date_time as POSIXct, so ymd_hms() here
# re-parses an already-parsed column; the "1 failed to parse" warning below means
# one timestamp became NA (and is then dropped by the filter) — confirm intended.
wind <- wind %>%
mutate(date_time = ymd_hms(date_time)) %>%
filter(date_time >= as.POSIXct("2023-04-01") & date_time < as.POSIXct("2024-04-01"))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date_time = ymd_hms(date_time)`.
## Caused by warning:
## ! 1 failed to parse.
# Calculate number of observations per hour
# floor_date() buckets every timestamp into its containing hour, and n() counts
# how many raw observations landed in each bucket.
observations_per_hour <- wind %>%
mutate(hourly_time = floor_date(date_time, "hour")) %>%
group_by(hourly_time) %>%
summarise(obs_count = n())
# Plot frequency of observation counts
# A bar chart of how many hours had 1, 2, 3, ... observations, to reveal how
# uneven the recording density is.
ggplot(observations_per_hour,
aes(x = obs_count)) +
geom_bar(fill = "skyblue",
color = "black") +
labs(title = "Frequency of Observations per Hour",
x = "Observations per Hour",
y = "Number of Hours") +
theme_minimal()
Cleaned data: removed columns that were not required, and stripped the timezone suffix from the
date_time column so its entries could be converted into POSIXct
objects.
Filtered data: removed data from outside the target period of April 2023 to March 2024, giving a consistent one-year time window for analysis.
Aggregated data: averaged wind speed measurements by hour, using the floor of each timestamp, to create a regular, evenly spaced time series suitable for ARIMA modelling.
# put data in date/time order
wind <- wind[order(wind$date_time), ]
# remove unrequired columns
wind$location_description <- NULL
wind$latitude <- NULL
wind$longitude <- NULL
# Convert date_time
# NOTE(review): at this point date_time is already POSIXct, so sub() coerces it
# to character first; the "+00:00" suffix was stripped at import, making this
# substitution effectively a no-op before re-parsing — confirm.
wind$date_time <- sub("\\+00:00$", "", wind$date_time) # Remove the "+00:00" timezone suffix
wind$date_time <- ymd_hms(wind$date_time) # Convert to POSIXct datetime objects
# rename columns
names(wind)[names(wind) == "average_wind_speed"] <- "wind_speed"
# Keep only date_time and wind_speed
wind <- wind[ , 1:2]
# Filter to April 2023-April 2024
wind <- wind %>%
mutate(date_time = ymd_hms(date_time)) %>%
filter(date_time >= as.POSIXct("2023-04-01") & date_time < as.POSIXct("2024-04-01"))
# Aggregate the data to hourly averages:
wind_hourly <- wind %>%
mutate(hourly_time = floor_date(date_time, "hour")) %>% # Round down each timestamp to the hour
group_by(hourly_time) %>% # Group by the hourly time
summarise(
wind_speed = mean(wind_speed, na.rm = TRUE), # Compute the mean wind speed for each hour
) %>%
ungroup()
# Check for any missing values in hourly_time column
cat("Missing Values: ")
## Missing Values:
sum(is.na(wind_hourly$hourly_time))
## [1] 0
wind_speed_data <- wind_hourly
# NOTE(review): the hourly series starts at 2023-03-31 16:00, which suggests the
# as.POSIXct() filter bounds use the session's local timezone while the data is
# UTC — verify the one-year window is the one intended.
head(wind_speed_data, 24)
## # A tibble: 24 × 2
## hourly_time wind_speed
## <dttm> <dbl>
## 1 2023-03-31 16:00:00 1.96
## 2 2023-03-31 17:00:00 2.71
## 3 2023-03-31 18:00:00 3.17
## 4 2023-03-31 19:00:00 2.9
## 5 2023-03-31 20:00:00 3.46
## 6 2023-03-31 21:00:00 3.68
## 7 2023-03-31 22:00:00 3.62
## 8 2023-03-31 23:00:00 4.82
## 9 2023-04-01 00:00:00 6.40
## 10 2023-04-01 01:00:00 5.95
## # ℹ 14 more rows
Plot whole timeseries to look for seasonality, trends, anomalies
Plot 7 day smoothed timeseries to look for longer-term trends
#Plot whole timeseries to look for seasonality, trends, anomalies
# Set frequency assuming hourly data (24 obs per day)
# NOTE(review): ts() here is given a start but no frequency, so the x-axis is in
# raw observation units; frequency = 24 is set later where the series is modelled.
ts_wind <- ts(wind_speed_data$wind_speed,
start = c(2023, 16))
# Plot the time series (axis label typo fixed: unbalanced parenthesis)
autoplot(ts_wind) +
ggtitle("Wind Speed Time Series")+
xlab("Elapsed Time (Hourly)") +
ylab("Wind Speed (m/s)")
# Plot 7 day smoothed timeseries to look for longer-term trends
# Apply a rolling mean to highlight trend shifts (7-day window)
# SMA needs n = 168 hourly points before producing a value, which is why the
# first 167 rows are NA (see warning below).
wind_speed_data$trend <- SMA(wind_speed_data$wind_speed,
n=24*7) # 7-day smoothing
# Plot the smoothed trend (axis label typo fixed: unbalanced parenthesis)
ggplot(wind_speed_data, aes(x=hourly_time, y=trend)) +
geom_line(color="blue") +
ggtitle("Smoothed Wind Speed Trend (7-day Moving Average)") +
xlab("Elapsed Time (Hourly)") +
ylab("Wind Speed (m/s)")
## Warning: Removed 167 rows containing missing values or values outside the scale range
## (`geom_line()`).
Fitted a Weibull distribution to the observed hourly wind speed data using Maximum Likelihood Estimation (MLE)
set.seed(123) # reproducibility
wind_sample <- sample(wind_speed_data$wind_speed, 1000) # can only have up to 1000 samples for test so randomly sampled from observations
# weibullness test - Null Hypothesis is data follows Weibull
wp_test_result <- wp.test(wind_sample)
wp_test_result
##
## Weibullness test from the Weibull plot
##
## data: wind_sample
## correlation = 0.99727278, p-value = 0.1133085
# p-value = 0.113 > 0.05, so the null hypothesis (data follow a Weibull
# distribution) is not rejected for this sample.
# Fit a Weibull distribution to the wind speed data of entire year
annual_weibull_fit <- fitdist(wind_speed_data$wind_speed,
"weibull",
method = "mle")
# Extract parameters
# fitdist() returns a named estimate vector; index by name for robustness.
shape_param <- annual_weibull_fit$estimate["shape"]
scale_param <- annual_weibull_fit$estimate["scale"]
# Plot the fitted Weibull distribution over the empirical density histogram.
# Updated to current ggplot2 API: `..density..` was deprecated in ggplot2 3.4.0
# in favour of after_stat(density), and `size` for lines was deprecated in
# favour of `linewidth` — the original emitted both deprecation warnings.
ggplot(wind_speed_data, aes(x = wind_speed)) +
geom_histogram(aes(y = after_stat(density)),
bins = 30,
fill = "blue",
alpha = 0.5) +
stat_function(fun = dweibull,
args = list(shape = shape_param,
scale = scale_param),
color = "red",
linewidth = 1) +
ggtitle("Fitted Weibull Distribution for Wind Speed") +
xlab("Wind Speed (m/s)") +
ylab("Density")
# Print the fitted annual parameters (shape ≈ 2.02, scale ≈ 5.14)
annual_weibull_fit
## Fitting of the distribution ' weibull ' by maximum likelihood
## Parameters:
## estimate Std. Error
## shape 2.017811103 0.01674106814
## scale 5.136559196 0.02885195187
# Analyse fit against weibull dist
# qqcomp() draws empirical vs theoretical quantiles for the fitted object(s).
qqcomp(list(annual_weibull_fit),
legendtext = "Weibull")
# add season column to dataframe
# Southern-hemisphere meteorological seasons: Dec-Feb = Summer, etc.
wind_speed_data <- wind_speed_data %>%
mutate(
month = month(hourly_time),
season = case_when(
month %in% c(12, 1, 2) ~ "Summer",
month %in% c(3, 4, 5) ~ "Autumn",
month %in% c(6, 7, 8) ~ "Winter",
month %in% c(9, 10, 11) ~ "Spring"
)
)
# fit weibull to each season separately
# NOTE(review): each season is fitted twice (once per parameter) — fitting once
# into a list column would halve the work; left as-is to match the output below.
seasonal_params <- wind_speed_data %>%
group_by(season) %>%
summarise(
shape = fitdist(wind_speed, "weibull", method = "mle")$estimate["shape"],
scale = fitdist(wind_speed, "weibull", method = "mle")$estimate["scale"],
.groups = "drop"
)
## Warning: There were 16 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `shape = fitdist(wind_speed, "weibull", method =
## "mle")$estimate["shape"]`.
## ℹ In group 1: `season = "Autumn"`.
## Caused by warning in `dweibull()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 15 remaining warnings.
# NOTE(review): the NaN warnings presumably come from the optimiser probing
# invalid (negative) parameter values during MLE; the final estimates converge.
seasonal_params
## # A tibble: 4 × 3
## season shape scale
## <chr> <dbl> <dbl>
## 1 Autumn 1.79 4.38
## 2 Spring 2.18 5.35
## 3 Summer 2.45 5.44
## 4 Winter 1.88 5.34
# plot 4 seasons PDF with red theoretical PDF
# Create Weibull PDF data frame
# For each season: a 300-point grid over the observed range, joined with that
# season's fitted parameters, evaluated through dweibull().
pdf_data <- wind_speed_data %>%
group_by(season) %>%
summarise(x = list(seq(min(wind_speed),
max(wind_speed),
length.out = 300))) %>%
unnest(x) %>%
left_join(seasonal_params,
by = "season") %>%
mutate(pdf = dweibull(x,
shape = shape,
scale = scale))
# Overlay the empirical density (filled) with the fitted PDF (red), one facet
# per season; free y-scales because seasonal densities differ in height.
ggplot(wind_speed_data, aes(x = wind_speed, fill = season)) +
geom_density(alpha = 0.4) +
geom_line(data = pdf_data,
aes(x = x, y = pdf),
colour = "red",
inherit.aes = FALSE) +
facet_wrap(~season, scales = "free_y") +
labs(title = "Wind Speed Distributions by Season",
x = "Wind Speed (m/s)",
y = "Density") +
theme_minimal()
# plot QQ plots for each season
# Generate theoretical Weibull quantiles for each row using seasonal shape/scale
qq_data <- wind_speed_data %>%
left_join(seasonal_params, by = "season") %>%
group_by(season) %>%
arrange(wind_speed) %>%
mutate(
n = n(),
p = ppoints(n), # generate uniform probs for quantiles
weibull_q = qweibull(p,
shape = shape,
scale = scale)
) %>%
ungroup()
# Plot Q–Q plots by season
# Points near the identity line (black) indicate a good seasonal Weibull fit.
ggplot(qq_data,
aes(x = weibull_q,
y = wind_speed,
colour = season)) +
geom_point(alpha = 0.4) +
geom_abline(slope = 1,
intercept = 0,
colour = "black") +
facet_wrap(~season,
scales = "free") +
labs(title = "Q–Q Plots of Weibull Fits by Season",
x = "Theoretical Quantiles (Weibull)",
y = "Empirical Quantiles (Observed Wind Speed)") +
theme_minimal() +
theme(legend.position = "none")
# store each season model as fit object
# Keep the full fitdist objects (not just parameters) so logLik/AIC can be
# extracted below; fitdist defaults to method = "mle".
seasonal_fits <- wind_speed_data %>%
group_by(season) %>%
summarise(
fit = list(fitdist(wind_speed, "weibull")), # MLE fit for each season
.groups = "drop"
)
## Warning: There were 8 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `fit = list(fitdist(wind_speed, "weibull"))`.
## ℹ In group 1: `season = "Autumn"`.
## Caused by warning in `dweibull()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 7 remaining warnings.
# Extract AICs and log-likelihoods for ANNUAL
loglik_annual <- logLik(annual_weibull_fit)
aic_annual <- AIC(annual_weibull_fit)
# Extract AICs and log-likelihoods for each season
seasonal_fits <- seasonal_fits %>%
mutate(
loglik = map_dbl(fit, ~ as.numeric(logLik(.))),
aic = map_dbl(fit, AIC)
)
# Sum the seasonal log-likelihoods and AICs, and label as "Seasonal" model
# Summing is valid because the four seasonal fits partition the data, so the
# joint log-likelihood of the seasonal model is the sum of the parts.
comparison <- seasonal_fits %>%
summarise(
total_loglik = sum(loglik),
total_aic = sum(aic)
) %>%
mutate(
model = "Seasonal"
) %>%
# Add the annual model results to comparison
bind_rows(tibble(
model = "Annual",
total_loglik = as.numeric(loglik_annual),
total_aic = aic_annual
))
# Lower AIC for "Seasonal" indicates the seasonal model fits better despite
# its extra parameters.
comparison
## # A tibble: 2 × 3
## total_loglik total_aic model
## <dbl> <dbl> <chr>
## 1 -19016. 38048. Seasonal
## 2 -19215. 38434. Annual
# Simulate seasonal wind speeds using seasonal Weibull parameters
set.seed(123) # Reproducibility
# Join fitted shape/scale to the original data
wind_seasonal <- wind_speed_data %>%
left_join(seasonal_params,
by = "season")
# Simulate wind speeds per row using the season's parameters
# NOTE(review): rowwise() + rweibull(1, ...) is slow; a vectorised
# rweibull(n(), shape, scale) would be much faster, but is left unchanged here
# to preserve the exact simulated values captured below.
# Also note this OVERWRITES the observed wind_speed column in wind_seasonal.
wind_seasonal <- wind_seasonal %>%
rowwise() %>%
mutate(wind_speed = rweibull(1,
shape = shape,
scale = scale)) %>%
ungroup()
# Store simulated values in a separate data frame
simulated_data <- wind_seasonal %>%
select(season, wind_speed)
# Compare summary statistics
cat("Simulated Data:\n")
## Simulated Data:
summary(simulated_data$wind_speed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06178181 2.82084129 4.34837356 4.57010047 6.03621303 15.09884434
cat("\nActual Data:\n")
##
## Actual Data:
summary(wind_speed_data$wind_speed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.310000 2.780000 4.270000 4.545675 5.948750 15.142500
# Plot simulated vs actual wind speed distributions
# NOTE(review): `size` for lines is deprecated in ggplot2 >= 3.4.0; `linewidth`
# is the modern equivalent.
ggplot() +
geom_density(data = wind_speed_data,
aes(x = wind_speed,
color = "Actual Wind Speed"),
size = 1) +
geom_density(data = simulated_data,
aes(x = wind_speed,
color = "Simulated Wind Speed"),
size = 1,
linetype = "dashed") +
ggtitle("Comparison: Simulated vs. Actual Wind Speed Distributions") +
xlab("Wind Speed (m/s)") +
ylab("Density") +
scale_color_manual(values = c("Actual Wind Speed" = "blue",
"Simulated Wind Speed" = "tomato"))
# data frame of quantiles
# Sorting both vectors pairs equal-rank quantiles (both have the same length).
qq_df <- data.frame(
actual = sort(wind_speed_data$wind_speed),
simulated = sort(simulated_data$wind_speed)
)
# Q–Q plot
ggplot(qq_df, aes(x = actual,
y = simulated)) +
geom_point(alpha = 0.4,
colour = "purple") +
geom_abline(slope = 1,
intercept = 0,
colour = "black") +
labs(
title = "Q–Q Plot: Actual vs Simulated Wind Speed",
x = "Actual Quantiles",
y = "Simulated Quantiles"
) +
theme_minimal(base_size = 12)
# convert to Timeseries
# Redefines ts_wind, this time with frequency = 24 so that seasonal (daily)
# structure is visible to ACF/auto.arima.
ts_wind <- ts(wind_speed_data$wind_speed,
frequency = 24)
# plot Autocorrelation to look for seasonal patterns
ggAcf(ts_wind, lag.max = 200) +
ggtitle("Autocorrelation Function (ACF) of Hourly Wind Speed")
# Test for stationarity
# ADF null hypothesis: the series has a unit root (non-stationary).
adf_test <- adf.test(ts_wind)
## Warning in adf.test(ts_wind): p-value smaller than printed p-value
adf_test
##
## Augmented Dickey-Fuller Test
##
## data: ts_wind
## Dickey-Fuller = -13.291272, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
# p-value <= 0.01 rejects the unit-root null: the series is stationary.
# find model params with auto.arima
# Default stepwise search with approximation — fast but not exhaustive.
fit_sarima <- auto.arima(ts_wind, seasonal = TRUE)
summary(fit_sarima)
## Series: ts_wind
## ARIMA(0,1,1)(0,0,2)[24]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1123117 0.0587036 0.0510640
## s.e. 0.0108745 0.0107938 0.0103651
##
## sigma^2 = 0.7790499: log likelihood = -11197.73
## AIC=22403.47 AICc=22403.47 BIC=22431.73
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 0.0001938985014 0.8824340356 0.6463653384 -3.135159909 18.52251643
## MASE ACF1
## Training set 0.2814236465 -0.001578571712
# Ljung-Box on residuals: small p-value means residual autocorrelation remains.
checkresiduals(fit_sarima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(0,0,2)[24]
## Q* = 209.24384, df = 45, p-value < 2.2204e-16
##
## Model df: 3. Total lags used: 48
The following block is commented out due to excessive computational time; its result has been entered manually below to shorten the run time.
# # find model params with auto.arima without stepwise and approx
# fit_sarima_noapprox <- auto.arima(ts_wind,
# stepwise = FALSE,
# approximation = FALSE)
# # result: ARIMA(2,1,0)(2,0,0)[24]
# check fit of the model selected by the exhaustive search
# summary(fit_sarima_noapprox)
# checkresiduals(fit_sarima_noapprox)
# find model params with auto.arima without stepwise and approx result manual
# Re-fits the exhaustive search's chosen specification directly with Arima().
fit_sarima_noapprox <- Arima(ts_wind,
order = c(2, 1, 0), # Non-seasonal: AR(2), I(1), MA(0)
seasonal = list(order = c(2, 0, 0), # Seasonal: SAR(2), SD(0), SMA(0)
period = 24))
fit_sarima_noapprox
## Series: ts_wind
## ARIMA(2,1,0)(2,0,0)[24]
##
## Coefficients:
## ar1 ar2 sar1 sar2
## 0.1100981 -0.0277679 0.0632079 0.0549474
## s.e. 0.0068348 0.0027715 0.0103903 0.0029208
##
## sigma^2 = 0.7782804: log likelihood = -11192.86
## AIC=22395.72 AICc=22395.72 BIC=22431.05
checkresiduals(fit_sarima_noapprox)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,0)(2,0,0)[24]
## Q* = 210.34591, df = 44, p-value < 2.2204e-16
##
## Model df: 4. Total lags used: 48
# ACF and PACF with No differencing
# NOTE(review): ggtsdisplay's argument is `lag.max`; `lag` relies on R's
# partial argument matching — spell it out if this is ever refactored.
ggtsdisplay(ts_wind, lag = 24,
main = "Raw - no differencing")
# ACF and PACF with differencing
ggtsdisplay(diff(ts_wind), lag = 24,
main = "First Difference")
# Function to allow for SARIMA manual combinations: returns model and residual check
#
# Args:
#   ts_data:         ts object to model
#   order:           non-seasonal (p, d, q) specification
#   seasonal_order:  seasonal (P, D, Q) specification
#   seasonal_period: seasonal period (24 = daily cycle for hourly data)
# Returns: the fitted Arima model. checkresiduals() prints the Ljung-Box test
# and residual plots as a side effect before returning.
# NOTE(review): this redefines the name `fit_sarima`, clobbering the auto.arima
# model object stored under the same name earlier in the script.
fit_sarima <- function(ts_data,
                       order = c(0, 0, 0),
                       seasonal_order = c(0, 0, 0),
                       seasonal_period = 24) {
  model <- Arima(ts_data,
                 order = order,
                 seasonal = list(order = seasonal_order,
                                 period = seasonal_period))
  # (A bare `model` statement was removed here: inside a function it is a
  # no-op, since auto-printing only happens at top level.)
  checkresiduals(model)
  return(model)
}
# This code block is used to test multiple combinations of SARIMA and check AIC and residual ACF
# The model object prints after the residual check because fit_sarima()
# returns it at top level.
fit_sarima(ts_wind,
order = c(2, 1, 2),
seasonal_order = c(1, 0, 1))
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(1,0,1)[24]
## Q* = 85.210873, df = 42, p-value = 9.131858e-05
##
## Model df: 6. Total lags used: 48
## Series: ts_data
## ARIMA(2,1,2)(1,0,1)[24]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1
## 1.5004928 -0.5492569 -1.4673250 0.4675550 0.6616748 -0.5939269
## s.e. 0.0497728 0.0457704 0.0550776 0.0550052 0.0664114 0.0686531
##
## sigma^2 = 0.7411406: log likelihood = -10982.39
## AIC=21978.78 AICc=21978.8 BIC=22028.24
# best model found from Model Selection sections
# ARIMA(2,1,2)(1,0,1)[24]: lowest AIC (21978.78) of the candidates tried above.
bestmodel <- fit_sarima(ts_wind,
order = c(2,1,2),
seasonal_order = c(1, 0, 1))
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(1,0,1)[24]
## Q* = 85.210873, df = 42, p-value = 9.131858e-05
##
## Model df: 6. Total lags used: 48
bestmodel
## Series: ts_data
## ARIMA(2,1,2)(1,0,1)[24]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sar1 sma1
## 1.5004928 -0.5492569 -1.4673250 0.4675550 0.6616748 -0.5939269
## s.e. 0.0497728 0.0457704 0.0550776 0.0550052 0.0664114 0.0686531
##
## sigma^2 = 0.7411406: log likelihood = -10982.39
## AIC=21978.78 AICc=21978.8 BIC=22028.24
# forecast next 48 hours using SARIMA best model and plot
forecast_wind <- forecast(bestmodel, h = 48)
autoplot(forecast_wind) +
ggtitle("SARIMA Forecast for Wind Speed") +
ylab("Wind Speed (m/s)") +
xlab("Time") +
theme_minimal()
# Final Presentation plot of Forecast
# For better visualisation - zoom up on last 10 days
# Extract the last 240 observations (10 days)
last_10days <- window(forecast_wind$x, start = tail(time(forecast_wind$x), 240)[1])
# Recompute the forecast based only on the last 10 days, reusing the fitted
# SARIMA. BUG FIX: the original passed `forecast_wind$bestmodel`, which is not
# a component of a forecast object (the fitted model is stored in `$model`),
# so `model = NULL` was silently supplied and forecast() refit a default model
# instead of the selected SARIMA.
forecast_limited <- forecast(last_10days, model = forecast_wind$model)
# Define number of points (48-step horizon = 2 forecast days at frequency 24)
n_obs <- length(forecast_limited$x)
n_forecast <- length(forecast_limited$mean)
total_points <- n_obs + n_forecast
# Create x-axis labels: days 10..1 before the forecast (observed), then F1/F2
# for the two forecast days. (The original wrapped these in no-op rep() calls.)
x_labels <- c(as.character(10:1), "F1", "F2")
# Build data frame of observed + forecast values; prediction-interval lower
# bounds are clamped at 0 because wind speed cannot be negative.
df <- data.frame(
time_index = 1:total_points,
value = c(as.numeric(forecast_limited$x), as.numeric(forecast_limited$mean)),
type = c(rep("Observed", n_obs), rep("Forecast", n_forecast)),
upper_80 = c(rep(NA, n_obs), forecast_limited$upper[, "80%"]),
lower_80 = c(rep(NA, n_obs), pmax(0, forecast_limited$lower[, "80%"])),
upper_95 = c(rep(NA, n_obs), forecast_limited$upper[, "95%"]),
lower_95 = c(rep(NA, n_obs), pmax(0, forecast_limited$lower[, "95%"]))
)
# Long-format ribbon data so the 80% and 95% intervals appear in the legend
ribbon_data <- rbind(
data.frame(time_index = (n_obs + 1):total_points,
ymin = df$lower_80[(n_obs + 1):total_points],
ymax = df$upper_80[(n_obs + 1):total_points],
Interval = "80% PI"),
data.frame(time_index = (n_obs + 1):total_points,
ymin = df$lower_95[(n_obs + 1):total_points],
ymax = df$upper_95[(n_obs + 1):total_points],
Interval = "95% PI")
)
# Plot with both 80% and 95% prediction intervals
p <- ggplot() +
geom_ribbon(data = ribbon_data,
aes(x = time_index, ymin = ymin, ymax = ymax, fill = Interval),
alpha = 0.3) +
geom_line(data = df, aes(x = time_index, y = value, colour = type)) +
scale_fill_manual(values = c("80% PI" = "blue", "95% PI" = "blue")) +
scale_colour_manual(values = c("Observed" = "#00BFC4", "Forecast" = "#F8766D")) +
guides(fill = guide_legend(title = NULL), # Remove fill legend title
colour = guide_legend(title = NULL)) + # Remove colour legend title
scale_x_continuous(
breaks = seq(12, total_points, by = 24), # one tick at the middle of each day
labels = unique(x_labels)
) +
ylab("Wind Speed (m/s)") +
xlab("Days before Forecast") +
labs(
title = "SARIMA 48 Hour Forecast for Wind Speed\nPrevious 10 Days + Forecast Days (F1, F2)"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 12)
)
ggplotly(p) # interactive plot
Separate the Seasons: Meteorological Season-specific SARIMA models
ARIMAX: ARIMA model with seasonal dummies as a covariate
# Filter to 1 season only
separate_data <- wind_speed_data %>% filter(season == "Summer")
ts_separate <- ts(separate_data$wind_speed, frequency = 24)
# BUG FIX: the original title used wind_speed_data$season[1] — the first season
# of the FULL data set (Autumn) — rather than the season actually being
# plotted; label from the filtered data instead.
autoplot(ts_separate) +
ggtitle(paste("Wind Speed Time Series -", separate_data$season[1]))
# Fit SARIMA
fit_separate <- auto.arima(ts_separate, seasonal = TRUE)
fit_separate
## Series: ts_separate
## ARIMA(2,0,1)(0,0,2)[24] with non-zero mean
##
## Coefficients:
## ar1 ar2 ma1 sma1 sma2 mean
## 1.5400995 -0.602653 -0.5402378 0.0663630 0.0356565 4.8115775
## s.e. 0.1014453 0.089210 0.1123488 0.0217245 0.0209964 0.1576854
##
## sigma^2 = 0.8232304: log likelihood = -2842.17
## AIC=5698.34 AICc=5698.39 BIC=5738.06
# Check residuals (Ljung-Box p = 0.04: mild residual autocorrelation remains)
checkresiduals(fit_separate)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,1)(0,0,2)[24] with non-zero mean
## Q* = 60.5658, df = 43, p-value = 0.0397039
##
## Model df: 5. Total lags used: 48
Further study here in ARIMAX used this: https://robjhyndman.com/hyndsight/arimax/
# 1. Encode seasons and make dummy matrix
# Ensure season is a factor
# Summer is listed first so it becomes the baseline level dropped below.
wind_speed_data$season <- factor(wind_speed_data$season, levels = c("Summer", "Autumn", "Winter", "Spring"))
# Create dummy variables using model.matrix (excluding 1 to avoid multicollinearity)
season_dummies <- model.matrix(~ season, data = wind_speed_data)[, -1] # drops intercept (Summer is baseline)
# Check the dummy matrix
# First rows are all-Autumn because the series starts at the end of March.
head(season_dummies)
## seasonAutumn seasonWinter seasonSpring
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 0
# 2. fit ARIMAX with covariate (exogenous regressors - encoded seasons)
ts_wind <- ts(wind_speed_data$wind_speed, frequency = 24) # daily frequency
# commented out due to excessive computational time. Results ARIMA(2,0,1)(2,0,0)
# Fit SARIMA with seasonal dummy variables as exogenous regressors
# ARIMAX_sea <- auto.arima(ts_wind,
# xreg = season_dummies,
# seasonal = TRUE,
# stepwise = FALSE,
# approximation = FALSE)
# auto.arima with xreg = season_dummies chosen model:
# Regression-with-ARIMA-errors: the season dummies shift the mean relative to
# the Summer baseline while ARIMA models the error structure.
ARIMAX_sea <- Arima(ts_wind,
order = c(2, 0, 1),
seasonal = list(order = c(2, 0, 0), period = 24),
xreg = season_dummies) # exogenous regressors
summary(ARIMAX_sea)
## Series: ts_wind
## Regression with ARIMA(2,0,1)(2,0,0)[24] errors
##
## Coefficients:
## ar1 ar2 ma1 sar1 sar2 intercept
## 1.3318332 -0.3958826 -0.2762553 0.0625750 0.0514319 4.6655589
## s.e. 0.0812837 0.0749536 0.0860459 0.0107758 0.0107867 0.2226704
## seasonAutumn seasonWinter seasonSpring
## -0.6015890 0.0393508 0.0836923
## s.e. 0.3053598 0.3117849 0.3040864
##
## sigma^2 = 0.742242: log likelihood = -10987.59
## AIC=21995.19 AICc=21995.21 BIC=22065.84
##
## Training set error measures:
## ME RMSE MAE MPE
## Training set -8.563043744e-05 0.8610866141 0.6366005467 -6.269189817
## MAPE MASE ACF1
## Training set 19.09981615 0.2771721139 0.004120957358
checkresiduals(ARIMAX_sea)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(2,0,1)(2,0,0)[24] errors
## Q* = 87.287061, df = 43, p-value = 7.599244e-05
##
## Model df: 5. Total lags used: 48